This notebook aims at locally training a neural network for sentiment analysis, before deploying it on Azure.
We'll compare:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
tqdm.pandas()
from sklearn.model_selection import train_test_split
import re
import nltk
from nltk.stem import snowball
from nltk.corpus import stopwords
import spacy
import tensorflow as tf
import string
import gensim.downloader
import gensim.models
import os
import tensorflow.keras as keras
import plotly.express as px
import itertools
import keras_tuner as kt
# Load spacy model for lemmatization
spacy_model = 'en_core_web_lg'
if spacy_model not in spacy.util.get_installed_models():
!{sys.executable} -m spacy download {spacy_model}
# nltk.download('stopwords')
# enabling plots export to html
import plotly
plotly.offline.init_notebook_mode()
%%time
# Load the Sentiment140 CSV (1.6M tweets, no header row); keep only the
# tweet text (column 5) and the sentiment label (column 0), then name them.
tweets = pd.read_csv('training.1600000.processed.noemoticon.csv', header=None)
tweets = tweets[[5,0]]
tweets.columns = (['text', 'sentiment'])
CPU times: user 2.17 s, sys: 186 ms, total: 2.36 s Wall time: 2.47 s
tweets
| text | sentiment | |
|---|---|---|
| 0 | @switchfoot http://twitpic.com/2y1zl - Awww, t... | 0 |
| 1 | is upset that he can't update his Facebook by ... | 0 |
| 2 | @Kenichan I dived many times for the ball. Man... | 0 |
| 3 | my whole body feels itchy and like its on fire | 0 |
| 4 | @nationwideclass no, it's not behaving at all.... | 0 |
| ... | ... | ... |
| 1599995 | Just woke up. Having no school is the best fee... | 4 |
| 1599996 | TheWDB.com - Very cool to hear old Walt interv... | 4 |
| 1599997 | Are you ready for your MoJo Makeover? Ask me f... | 4 |
| 1599998 | Happy 38th Birthday to my boo of alll time!!! ... | 4 |
| 1599999 | happy #charitytuesday @theNSPCC @SparksCharity... | 4 |
1600000 rows × 2 columns
sample, _ = train_test_split(tweets, train_size=10_000, stratify=tweets['sentiment'], random_state=42)
sample
| text | sentiment | |
|---|---|---|
| 500415 | still sitting under the dryer, my neck hurts | 0 |
| 1577236 | @sarahshah this is my nightmare (even tho i on... | 4 |
| 178111 | @mjvarela black is good... tight, or should I ... | 0 |
| 396033 | Takes forever for everybody to get ready. | 0 |
| 31962 | @Bklyncookie omg all the LA bad weather aura i... | 0 |
| ... | ... | ... |
| 1282270 | @ChristinaNewman pounds are SOO over rated! I ... | 4 |
| 436582 | I'm going to miss dancing this summer. | 0 |
| 552624 | I need a hug...and less cynicism. It's making ... | 0 |
| 443309 | ugh... i have an upset stomach...ugh ... i no ... | 0 |
| 1270172 | At noodleword ! With a couple faggets. Hahah jk | 4 |
10000 rows × 2 columns
# Split the sample into 80% train, then halve the remainder into a
# 10% validation set and a 10% test set (stratified, fixed seed).
train_set, val_test_set = train_test_split(
    sample, train_size=0.8, stratify=sample['sentiment'], random_state=42)
val_set, test_set = train_test_split(
    val_test_set, train_size=0.5, stratify=val_test_set['sentiment'], random_state=42)
del val_test_set
for split_name, split_df in (('train', train_set), ('validation', val_set), ('test', test_set)):
    print(f'{split_name} set shape:', split_df.shape)
train set shape: (8000, 2) validation set shape: (1000, 2) test set shape: (1000, 2)
class DataPreprocessor:
    '''
    Preprocess text according to normalization method (lemmatization, stemming
    or keep original form), and optionally vectorization.
    Process sentiment column into values 0 (for happy tweet) or 1 (for unhappy tweet).
    Return dataframe with 1st column for sentiment and other columns for vectors.

    Parameters
    ----------
    normalization : {'lem', 'stem', 'keep'}
        'lem' = Spacy lemmatization, 'stem' = nltk Snowball stemming,
        'keep' = no normalization (stop words are still removed).
    vectorization : {'word2vec', 'fasttext', 'glove'}
        Name of the pre-trained gensim embedding model to load.
    '''
    # Patterns compiled once and shared by all instances (raw strings so
    # '\S' / '\d' are real regex escapes, not Python string escapes):
    # twitter @user handles, punctuation + digits, runs of spaces.
    _TWITTER_RE = re.compile(r'@\S*')
    _PUNCT_RE = re.compile('[' + re.escape(string.punctuation) + r'\d]')
    _MULTISPACE_RE = re.compile(r'[ ]{2,}')

    def __init__(self, normalization='lem', vectorization="word2vec"):
        if normalization.lower() not in ['lem', 'stem', 'keep']:
            raise ValueError('Invalid normalization method. Valid values are'
                             ' "lem" (Spacy lemmatization), "stem" (nltk stemming)'
                             ' and "keep" (no transformation).')
        self.normalization = normalization
        if self.normalization == 'stem':
            self.stemmer = snowball.EnglishStemmer()
        elif self.normalization == 'lem':
            # relies on the module-level `spacy_model` name
            self.nlp = spacy.load(spacy_model)
        self.stop_words = stopwords.words('english')
        # Map user-facing method names to gensim pre-trained model names.
        self.vec_methods = {'word2vec': 'word2vec-google-news-300',
                            'fasttext': 'fasttext-wiki-news-subwords-300',
                            'glove': 'glove-twitter-200'}
        if vectorization.lower() not in self.vec_methods:
            # fix: the original passed two positional args to ValueError,
            # producing a tuple-shaped message; build one string instead.
            raise ValueError('Invalid vectorization method. Valid values are '
                             + ', '.join(self.vec_methods))
        self.vectorization = vectorization
        model_name = self.vec_methods.get(self.vectorization)
        print(f'Loading vectors for {self.vectorization} model, please wait...')
        self.vectors = gensim.downloader.load(model_name)
        print('Vectors loaded.')

    def _normalize_text(self, input_string):
        '''
        Return input_string after deleting stop words / twitter user names /
        punctuation / digits / multiple spaces, and stemming or lemmatization
        according to self.normalization. Returns np.nan if nothing remains,
        so empty rows can be dropped with dropna().
        '''
        result = self._TWITTER_RE.sub('', input_string)
        result = self._PUNCT_RE.sub(' ', result)
        result = self._MULTISPACE_RE.sub(' ', result)
        if self.normalization == 'keep':
            result = ' '.join(word for word in result.split()
                              if word not in self.stop_words)
        elif self.normalization == 'stem':
            result = ' '.join(self.stemmer.stem(word) for word in result.split()
                              if word not in self.stop_words)
        elif self.normalization == 'lem':
            result = ' '.join(tok.lemma_.lower() for tok in self.nlp(result)
                              if tok.text not in self.stop_words)
        # np.nan (np.NaN alias was removed in NumPy 2.0)
        return result.strip() or np.nan

    def _vectorize_sentence(self, sentence):
        '''
        Return the average embedding vector of the sentence's in-vocabulary
        words, or a zero vector when no word is known to the embedding.
        '''
        known_words = [word for word in sentence.split()
                       if word in self.vectors.key_to_index]
        if known_words:
            return np.mean([self.vectors[word] for word in known_words], axis=0)
        return np.zeros((self.vectors.vector_size,))

    def _vectorize_dataset(self, dataframe, sentiment_col='sentiment', text_col='text'):
        '''
        For given dataframe with columns "text" and "sentiment", return
        a dataframe with same nb of rows, with first column = "sentiment"
        and next columns = embedding vector for "text".
        '''
        vec_df = pd.DataFrame(dataframe[text_col].apply(
            self._vectorize_sentence).tolist(), index=dataframe.index)
        return pd.concat((pd.DataFrame(dataframe[sentiment_col]), vec_df), axis=1)

    def preprocess_dataset(self, dataframe, sentiment_col='sentiment', text_col='text', pos_label=0):
        '''
        Return tuple:
        - DataFrame of text embedding vectors
        - single-column DataFrame for the sentiment feature after converting
          values (pos_label --> 1, everything else --> 0; with the default
          pos_label=0 this maps label 4 --> 0 and label 0 --> 1).
        Rows whose text is empty after cleaning are dropped.
        '''
        result = dataframe.copy()
        result[sentiment_col] = (result[sentiment_col] == pos_label).astype(int)
        result[text_col] = result[text_col].apply(self._normalize_text)
        result = result.dropna()
        result = self._vectorize_dataset(result, sentiment_col=sentiment_col, text_col=text_col)
        return result.iloc[:, 1:], result.iloc[:, :1]
#########################################
############ TESTS ##############
#########################################
# Smoke tests with the default preprocessor (lem + word2vec):
# - a digits-only string is emptied by cleaning and must yield NaN;
# - a sentence vector must have the embedding size (300 for word2vec-google-news-300).
dp = DataPreprocessor()
assert np.isnan(dp._normalize_text('123'))
assert dp._vectorize_sentence("cats like dogs but dogs don't like cats").shape == (300,)
print('All tests passed.')
Loading vectors for word2vec model, please wait... Vectors loaded. All tests passed.
Since for us the positive case is the case of negative/unhappy sentiment, we turn the "sentiment" column into expected values:
Text must be cleaned before vectorization. We'll remove:
Then we'll apply stemming or lemmatization to enhance the model performance. We'll compare the performance of both methods through the model results. Here is an example of each preprocessing method:
# Show the effect of each normalization method on one noisy tweet.
# NOTE(review): every DataPreprocessor() call reloads the word2vec vectors,
# which is slow; only _normalize_text is needed here — consider a lighter path.
test_string = "@mimi2000 We, finally!: went to the shopping) 12centers! 34"
print('Test string:')
print(test_string)
print('\nPreprocessed string with lemmatization:')
print(DataPreprocessor(normalization='lem')._normalize_text(test_string))
print('\nPreprocessed string with stemming:')
print(DataPreprocessor(normalization='stem')._normalize_text(test_string))
# fix: typo "lemmaization" in the printed label
print('\nPreprocessed string with no stemming/lemmatization:')
print(DataPreprocessor(normalization='keep')._normalize_text(test_string))
Test string: @mimi2000 We, finally!: went to the shopping) 12centers! 34 Preprocessed string with lemmatization: Loading vectors for word2vec model, please wait... Vectors loaded. we finally go shopping center Preprocessed string with stemming: Loading vectors for word2vec model, please wait... Vectors loaded. we final went shop center Preprocessed string with no stemming/lemmaization: Loading vectors for word2vec model, please wait... Vectors loaded. We finally went shopping centers
For our first try, we'll use pre-trained Word2vec English model from Gensim.
dp.vectorization
'word2vec'
dp.vectors.similar_by_word('cat')
[('cats', 0.8099379539489746),
('dog', 0.7609456181526184),
('kitten', 0.7464985251426697),
('feline', 0.7326233983039856),
('beagle', 0.7150582671165466),
('puppy', 0.7075453400611877),
('pup', 0.6934289932250977),
('pet', 0.6891530752182007),
('felines', 0.6755931377410889),
('chihuahua', 0.6709762811660767)]
dp.vectors.similar_by_word('dog')
[('dogs', 0.8680490851402283),
('puppy', 0.8106428384780884),
('pit_bull', 0.780396044254303),
('pooch', 0.7627375721931458),
('cat', 0.7609457969665527),
('golden_retriever', 0.7500901222229004),
('German_shepherd', 0.7465173006057739),
('Rottweiler', 0.7437615990638733),
('beagle', 0.7418619990348816),
('pup', 0.7406911253929138)]
To embed whole sentences, we'll average the vectors of each word.
Our function is ready to preprocess each dataset:
# Vectorize each split with the default preprocessor (lem + word2vec).
X_train, y_train = dp.preprocess_dataset(train_set)
X_val, y_val = dp.preprocess_dataset(val_set)
X_test, y_test = dp.preprocess_dataset(test_set)
Now that we have cleaned the data, we can create the model:
# Create model
def build_model(activation='tanh'):
    '''
    Build and compile a dense binary classifier over sentence embeddings.

    Parameters
    ----------
    activation : str
        Hidden-layer activation ('tanh', 'relu' or 'selu'); the kernel
        initializer is matched to it (Glorot / He / LeCun).

    Returns
    -------
    A compiled keras Sequential model (binary crossentropy, SGD,
    accuracy/precision/recall metrics).

    NOTE(review): reads the module-level X_train to size the input layer.
    '''
    nb_hidden_layers = 3
    nb_units = 128
    dropout_rate = 0.1
    learning_rate = 0.01
    # Recommended initializer for each activation family.
    initializers = {'tanh': keras.initializers.glorot_normal,
                    'relu': keras.initializers.he_normal,
                    'selu': keras.initializers.lecun_normal}
    initializer = initializers.get(activation)
    model = keras.models.Sequential()
    # fix: shape must be a tuple — (X_train.shape[1]) is just an int
    model.add(keras.layers.Input(shape=(X_train.shape[1],)))
    for _ in range(nb_hidden_layers):
        model.add(keras.layers.Dense(nb_units, activation=activation,
                                     kernel_initializer=initializer))
        model.add(keras.layers.Dropout(dropout_rate))
    model.add(keras.layers.Dense(1, activation='sigmoid'))
    model.compile(loss="binary_crossentropy",
                  optimizer=keras.optimizers.SGD(learning_rate=learning_rate),
                  metrics=['accuracy',
                           keras.metrics.Precision(name='precision'),
                           keras.metrics.Recall(name='recall')])
    return model
# Create custom function for Tensorboard logfiles
root_logdir = os.path.join(os.curdir, "my_logs")

def get_run_logdir(root=None):
    '''
    Return a fresh, timestamped run-log directory path for TensorBoard.

    Parameters
    ----------
    root : str or None
        Base directory; defaults to the module-level ``root_logdir``
        (backward compatible with the original zero-argument call).
    '''
    import time
    run_id = time.strftime("run_%Y_%m_%d-%H_%M_%S")
    return os.path.join(root if root is not None else root_logdir, run_id)
# One TensorBoard run directory + callback for this training session.
run_logdir = get_run_logdir()
tensorboard_cb = keras.callbacks.TensorBoard(run_logdir)
model = build_model()
# Fit model; EarlyStopping halts once val_loss stops improving for 5 epochs.
history = model.fit(X_train, y_train, epochs=50, validation_data=(X_val, y_val),
                    verbose=0, callbacks=[tensorboard_cb,
                                          keras.callbacks.EarlyStopping(monitor="val_loss", patience=5)])
# Plot the per-epoch training curves (loss + metrics).
px.line(pd.DataFrame(history.history), labels={'index': 'epochs'})
EarlyStopping worked as expected, stopping the training before the configured 50 epochs to avoid overfitting.
# Evaluate on the held-out test set (the returned loss is discarded).
_, test_accuracy, test_precision, test_recall = model.evaluate(X_test, y_test, verbose=0)
print(f'On the test set, the model reaches an accuracy of {test_accuracy:.2%},'
      f' a precision of {test_precision:.2%} and a recall of {test_recall:.2%}.')
On the test set, the model reaches an accuracy of 72.52%, a precision of 75.81% and a recall of 66.06%.
The recall oscillates a lot, maybe another activation function could fit better?
# Compare alternative activation functions against the tanh baseline.
# NOTE(review): tensorboard_cb is reused across runs, so all runs write to
# the same log directory — confirm this mixing is intended.
for activation_function in ['relu', 'selu']:
    model = build_model(activation_function)
    history = model.fit(X_train, y_train, epochs=50, validation_data=(X_val, y_val),
                        verbose=0,
                        callbacks=[tensorboard_cb,
                                   keras.callbacks.EarlyStopping(monitor="val_loss", patience=5)])
    fig = px.line(pd.DataFrame(history.history), labels={'index': 'epochs'},
                  title=f'With {activation_function} activation')
    fig.show()
    _, test_accuracy, test_precision, test_recall = model.evaluate(X_test, y_test, verbose=0)
    print(f'On the test set, the model reaches an accuracy of {test_accuracy:.2%},'
          f' a precision of {test_precision:.2%} and a recall of {test_recall:.2%}.')
On the test set, the model reaches an accuracy of 74.42%, a precision of 75.26% and a recall of 72.69%.
On the test set, the model reaches an accuracy of 70.11%, a precision of 66.23% and a recall of 81.93%.
We notice that the model converges much faster with the SELU activation function.
We used lemmatization and Word2vec embedding. Let's compare with other normalizing and embedding methods.
sample_size = 10_000
# Get sample (same stratified sample and splits as above, fixed seed)
sample, _ = train_test_split(tweets, train_size=sample_size, stratify=tweets['sentiment'], random_state=42)
# Get training data (80%) and validation + test data (split at next step)
train_set, val_test_set = train_test_split(
    sample, train_size=0.8, stratify=sample['sentiment'], random_state=42)
# split val_test in validation (10%) and test (10%) set
val_set, test_set = train_test_split(
    val_test_set, train_size=0.5, stratify=val_test_set['sentiment'], random_state=42)
del val_test_set
# Results table filled in by the comparison loop below.
results = pd.DataFrame(columns=['normalization', 'vectorization', 'accuracy', 'precision', 'recall', 'f2 score'])
preproc_params = ['stem', 'lem']
vecto_params = ['word2vec', 'glove', 'fasttext']
# All (normalization, vectorization) combinations to benchmark
# (idiom fix: list(...) instead of a pass-through comprehension).
params = list(itertools.product(preproc_params, vecto_params))
# Benchmark every normalization x vectorization combination with the same
# SELU model, and record test-set metrics for each.
for normalization, vectorization in tqdm(params):
    # preprocess datasets
    print(f'Preprocessing datasets with {normalization} and {vectorization}')
    dp = DataPreprocessor(normalization=normalization, vectorization=vectorization)
    X_train, y_train = dp.preprocess_dataset(train_set)
    X_val, y_val = dp.preprocess_dataset(val_set)
    X_test, y_test = dp.preprocess_dataset(test_set)
    # Create new model
    model = build_model('selu')
    # Fit model
    history = model.fit(X_train, y_train, epochs=50, validation_data=(X_val, y_val),
                        callbacks=[tensorboard_cb,
                                   keras.callbacks.EarlyStopping(monitor="val_loss", patience=5)],
                        verbose=0)
    print('Model fitted')
    # save results
    _, accuracy, precision, recall = model.evaluate(X_test, y_test, verbose=0)
    # F-beta with beta=2: recall weighted higher than precision.
    f2_score = 5 * precision * recall / ((4 * precision) + recall)
    row = pd.DataFrame([[normalization, vectorization, accuracy,
                         precision, recall, f2_score]],
                       columns=results.columns)
    # fix: DataFrame.append was removed in pandas 2.0 — use pd.concat
    results = pd.concat([results, row], ignore_index=True)
Preprocessing datasets with stem and word2vec Loading vectors for word2vec model, please wait... Vectors loaded. Model fitted Preprocessing datasets with stem and glove Loading vectors for glove model, please wait... Vectors loaded. Model fitted Preprocessing datasets with stem and fasttext Loading vectors for fasttext model, please wait... Vectors loaded. Model fitted Preprocessing datasets with lem and word2vec Loading vectors for word2vec model, please wait... Vectors loaded. Model fitted Preprocessing datasets with lem and glove Loading vectors for glove model, please wait... Vectors loaded. Model fitted Preprocessing datasets with lem and fasttext Loading vectors for fasttext model, please wait... Vectors loaded. Model fitted
results.sort_values(by='f2 score').style.format(dict.fromkeys(results.columns[2:], '{:.2%}'))
| normalization | vectorization | accuracy | precision | recall | f2 score | |
|---|---|---|---|---|---|---|
| 4 | lem | glove | 74.72% | 78.47% | 68.07% | 69.93% |
| 2 | stem | fasttext | 72.52% | 73.14% | 71.08% | 71.49% |
| 3 | lem | word2vec | 73.32% | 73.20% | 73.49% | 73.43% |
| 0 | stem | word2vec | 72.82% | 70.83% | 77.51% | 76.07% |
| 5 | lem | fasttext | 72.42% | 70.09% | 78.11% | 76.36% |
| 1 | stem | glove | 75.23% | 73.46% | 78.92% | 77.76% |
The lemmatization with glove or fasttext vectorization seems to be the best combination, we'll use it for next steps.
dp = DataPreprocessor('lem', 'fasttext')
Loading vectors for fasttext model, please wait... Vectors loaded.
# Re-vectorize each split with the retained combination (lem + fasttext).
X_train, y_train = dp.preprocess_dataset(train_set)
X_val, y_val = dp.preprocess_dataset(val_set)
X_test, y_test = dp.preprocess_dataset(test_set)
Since the recall is rather wobbly, we won't monitor it for tuning hyperparameters, but val_loss instead.
def build_model(hp):
    '''
    Keras-Tuner hypermodel: build and compile a dense binary classifier
    whose architecture is driven by the hyperparameter set *hp*.

    Search space: number of hidden layers (1-6), units per layer, activation
    (with matched initializer), optional dropout + its rate, and the SGD
    learning rate on a log scale.
    '''
    nb_hidden_layers = hp.Choice('nb_hidden_layers', values=[1, 2, 3, 4, 5, 6])
    nb_units = hp.Choice('nb_units', values=[8, 16, 32, 64, 128])
    initializers = {'tanh': keras.initializers.glorot_normal,
                    'relu': keras.initializers.he_normal,
                    'selu': keras.initializers.lecun_normal}
    activation = hp.Choice('activation', values=list(initializers.keys()))
    dropout = hp.Boolean('dropout')
    # NOTE(review): dropout_rate is always sampled, even when dropout is
    # False; a conditional scope would prune the search space.
    dropout_rate = hp.Float("dropout_rate", min_value=0.1, max_value=0.5)
    learning_rate = hp.Float("learning_rate", min_value=0.0001, max_value=0.1, sampling='log')
    model = keras.models.Sequential()
    # fix: shape must be a tuple — (X_train.shape[1]) is just an int
    model.add(keras.layers.Input(shape=(X_train.shape[1],)))
    for _ in range(nb_hidden_layers):
        model.add(keras.layers.Dense(nb_units, activation=activation,
                                     kernel_initializer=initializers[activation]))
        if dropout:
            model.add(keras.layers.Dropout(dropout_rate))
    model.add(keras.layers.Dense(1, activation='sigmoid'))
    model.compile(loss="binary_crossentropy",
                  optimizer=keras.optimizers.SGD(learning_rate=learning_rate),
                  metrics=['accuracy',
                           keras.metrics.Precision(name='precision'),
                           keras.metrics.Recall(name='recall')])
    return model
# Bayesian hyperparameter search minimizing validation loss (50 trials).
tuner = kt.BayesianOptimization(hypermodel=build_model,
                                objective=kt.Objective("val_loss",direction='min'),
                                max_trials=50,
                                overwrite=True,
                                directory='my_dir',
                                project_name='essai')
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
tuner.search(X_train, y_train, validation_data=(X_val, y_val), callbacks=[stop_early],)
Trial 50 Complete [00h 00m 01s] val_loss: 0.627572774887085 Best val_loss So Far: 0.598247230052948 Total elapsed time: 00h 02m 05s INFO:tensorflow:Oracle triggered exit
Now that the tuner has found good parameters, we can use them in our model:
# Show the best hyperparameters, retrain a model with them, and evaluate.
print('Best parameters:')
for key, value in tuner.get_best_hyperparameters()[0].values.items():
    print(key, ':', value)
model = build_model(tuner.get_best_hyperparameters()[0])
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=35, verbose=0)
px.line(history.history).show()
_, test_accuracy, test_precision, test_recall = model.evaluate(X_test, y_test, verbose=0)
print(f'On the test set, the model reaches an accuracy of {test_accuracy:.2%},'
      f' a precision of {test_precision:.2%} and a recall of {test_recall:.2%}.')
Best parameters: nb_hidden_layers : 6 nb_units : 128 activation : selu dropout : False dropout_rate : 0.5 learning_rate : 0.1
On the test set, the model reaches an accuracy of 71.01%, a precision of 67.05% and a recall of 82.53%.
The model gives different results on each session:
# Retrain the tuned model several times to gauge run-to-run variance
# (weights are randomly re-initialized on each build).
for _ in range(4):
    model = build_model(tuner.get_best_hyperparameters()[0])
    history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=35, verbose=0)
    px.line(history.history).show()
    _, test_accuracy, test_precision, test_recall = model.evaluate(X_test, y_test, verbose=0)
    print(f'On the test set, the model reaches an accuracy of {test_accuracy:.2%},'
          f' a precision of {test_precision:.2%} and a recall of {test_recall:.2%}.')
On the test set, the model reaches an accuracy of 66.80%, a precision of 61.74% and a recall of 88.15%.
On the test set, the model reaches an accuracy of 72.32%, a precision of 69.61% and a recall of 79.12%.
On the test set, the model reaches an accuracy of 72.12%, a precision of 68.71% and a recall of 81.12%.
On the test set, the model reaches an accuracy of 74.12%, a precision of 80.61% and a recall of 63.45%.
But globally we see that the accuracy does not go beyond about 75%.